In [306]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
In [307]:
# Showing matplotlib plots in jupyter notebook (figures render below the cell that draws them)
%matplotlib inline
In [308]:
# Getting Training dataset ( Variant #1 )
# Single place that names the dataset file, so the path is not repeated per reader
TRAIN_CSV_PATH = '../Dataset/Dataset/Training/Features_Variant_1.csv'
# Importing data as Dataframe (header=None: columns are labelled 0, 1, 2, ...)
df = pd.read_csv(TRAIN_CSV_PATH, sep=',', header=None)
# Same data as a float numpy array; derived from df instead of parsing the file a second time.
# astype(float) returns a fresh array, so later edits to X never write through to df.
X = df.values.astype(float)
In [309]:
# Target vector: column 53 of the raw matrix X.
# NOTE: this must run before the later cell that inserts the weekday column into X at
# index 53; after that insert, column 53 of X holds the weekday code instead.
y = X[:,53]
In [310]:
def matrixToNumber(df):
    """Collapse a block of 0/1 indicator columns into one integer code per row.

    The code for a row is ``n_cols - position - 1`` where ``position`` is the
    0-based position of the column holding a 1, so the first column maps to
    ``n_cols - 1`` and the last column maps to 0. If several columns hold a 1,
    the right-most one wins (columns are visited left to right).

    Rows without any 1 keep the initial value 1, which is the same code as the
    second-to-last column; callers that need to tell these apart must check
    the input themselves.

    Parameters
    ----------
    df : pandas.DataFrame, pandas.Series or numpy.ndarray
        Indicator data, one row per sample. Arrays and Series are wrapped in a
        DataFrame first. The argument is never modified.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one code per row, or the string
        "Function expect Dataframe" when ``df`` has an unsupported type
        (kept as a return value for backward compatibility).
    """
    # Converting df to Dataframe if df is Array or Series
    if isinstance(df, (np.ndarray, pd.Series)):
        df = pd.DataFrame(df)
    elif not isinstance(df, pd.DataFrame):
        return "Function expect Dataframe"

    n_rows, n_cols = df.shape
    codes = np.ones(n_rows, dtype='int')
    # Walk the columns by position rather than relabelling df.columns, so the
    # caller's frame is left untouched and any column labels are accepted.
    for position in range(n_cols):
        is_set = (df.iloc[:, position] == 1).values
        codes[is_set] = n_cols - position - 1
    # returning numpy array for easier later uses
    return codes
# Drop column with integer label
def drop_int(df, val):
    """Return a copy of ``df`` without column ``val``, closing the gap in the labels.

    Every column label greater than ``val`` is decreased by one, so integer
    labels stay contiguous after the removal. The input frame is not modified
    and the dtypes of the remaining columns are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer column labels.
    val : int
        Label of the column to remove.

    Returns
    -------
    pandas.DataFrame
        The reduced frame, or the string "Function expect Dataframe" when
        ``df`` is not a DataFrame (kept for backward compatibility).

    Raises
    ------
    KeyError
        If ``val`` is not a column label of ``df``.
    """
    # Checking df type that whether is Dataframe or not!
    if not isinstance(df, pd.DataFrame):
        return "Function expect Dataframe"
    # Drop along the column axis directly: transposing twice would push every
    # column through one common dtype and lose the original dtypes.
    reduced = df.drop(val, axis=1)
    return reduced.rename(columns=lambda label: label - 1 if label > val else label)
def plotLearningCurves(X,y,step):
    """Plot train and validation mean squared error against the sample count.

    For every size N in 10, 10 + step, 10 + 2 * step, ... (bounded by the row
    count of X rounded down to a multiple of 10) the first N rows of X and y
    are split 67/33 with random_state=42, the notebook-level model ``lm`` is
    fitted on the training part, and the MSE on both parts is recorded. Both
    curves are then drawn on one axes with a logarithmic y scale.

    Uses the notebook-level names ``lm``, ``train_test_split`` and
    ``mean_squared_error``; they must exist before this function is called.
    ``lm`` is refitted on every iteration, so after the call it holds the fit
    for the last subset size.

    Parameters
    ----------
    X : 2-D numpy array, one row per sample.
    y : 1-D array of targets aligned with the rows of X.
    step : int, increment between consecutive subset sizes.

    Returns
    -------
    None. The figure is created as a side effect.
    """
    row_count, _ = X.shape
    largest_size = int(row_count / 10) * 10
    subset_sizes = np.arange(10, largest_size + 10, step)
    # Column 0 holds the training MSE, column 1 the validation MSE
    mse_table = np.zeros((len(subset_sizes), 2))

    for row, size in enumerate(subset_sizes):
        # Split the first `size` samples into a training and a validation part
        X_fit, X_val, y_fit, y_val = train_test_split(
            X[:size, :], y[:size], test_size=0.33, random_state=42)
        lm.fit(X_fit, y_fit)
        mse_table[row, 0] = mean_squared_error(y_fit, lm.predict(X_fit))
        mse_table[row, 1] = mean_squared_error(y_val, lm.predict(X_val))

    # One axes filling the whole 12x8 figure, errors on a log scale
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    ax.set_yscale('log')
    train_line, = ax.plot(subset_sizes, mse_table[:, 0], c='red')
    test_line, = ax.plot(subset_sizes, mse_table[:, 1], c='blue')
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    ax.legend((train_line, test_line), ("Train Error", "Test Error"))
In [311]:
# Scratch check: show the type of the object returned by np.ones
a = np.ones((12))
type(a)
Out[311]:
In [312]:
# Getting first 10 training data rows in dataset
df.head(10)
Out[312]:
In [313]:
# Example row: the first sample of the numpy array X
X[0]
Out[313]:
In [314]:
# Discovering more information about dataset (column dtypes and non-null counts)
df.info()
In [315]:
# Summary statistics of the raw dataset columns
df.describe()
Out[315]:
In [316]:
# Getting unique values of "H Local" Feature (column 38 of X)
np.unique(X[:,38])
Out[316]:
In [317]:
# Import preprocessing functions in sklearn
from sklearn import preprocessing
# Normalize dataset features: columns 0..52 are replaced by their normalized values, while
# column 53 (the target) keeps its raw values because X_temp starts as a full copy of X.
# NOTE(review): axis=1 rescales every row (sample) to unit norm; it does not rescale each
# feature column. Confirm this is intended -- per-feature scaling would be axis=0.
X_temp = X.copy()
X_temp[:, :53] = preprocessing.normalize(X[:, :53], axis=1, copy=True)
X_norm = X_temp
# Converting X_norm to Dataframe
df_norm = pd.DataFrame(X_norm)
In [318]:
# Analysing data after normalization (summary statistics of df_norm)
df_norm.describe()
Out[318]:
In [319]:
# We use this column when we want to plot "Average of Total Comments" vs. "Pages Likes" and etc.
# Row-wise mean of columns 31, 32 and 33 of the normalized matrix.
# NOTE(review): the plots label this as the average of CC2, CC3, CC4 -- confirm that those
# features really sit at 0-based columns 31..33.
temp = X_norm[:, 31:34].mean(axis=1)
# Inserting new column before target column (index 53; the target moves to index 54).
# NOTE: not idempotent -- every re-run of this cell inserts one more column.
X_norm = np.insert(X_norm, 53, temp, axis=1)
# Converting new created Matrix to dataframe
df_norm = pd.DataFrame(X_norm)
In [320]:
# One integer weekday code per row, derived from the indicator columns 39..45
weekday = matrixToNumber(df.iloc[:, 39:46])
# Put the weekday code at index 53 of the raw matrix (the target moves to index 54) and
# rebuild df from it. NOTE: not idempotent -- every re-run inserts one more column.
X = np.insert(X, 53, weekday, axis=1)
df = pd.DataFrame(X)
In [321]:
# Five scatter panels over the normalized data; each add_axes rectangle is
# [left, bottom, width, height] in figure coordinates.
fig = plt.figure()
axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,0.8,0.8])
# "Post Length" (column 35) vs. "Likes/Popularity" (column 0)
axes1.plot(X_norm[:,35], X_norm[:,0], marker='o', markersize=5, lw= 0)
axes1.set_xlabel("Post Length")
axes1.set_ylabel("Post Likes/Popularity")
# =====> We can conclude when posts become very long, popularity of the posts decrease !!!!
# "Likes/Popularity" (column 0) vs. "Share Counts" (column 36)
axes2.plot(X_norm[:,0], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes2.set_xlabel("Post Likes/Popularity")
axes2.set_ylabel("Post Share Count ")
# ======> We can conclude when posts popularity increases, the amounts of share increases too.
# "Page talking about" (column 2) vs. "Share Counts" (column 36)
axes3.plot(X_norm[:,2], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes3.set_xlabel("Page talking about")
axes3.set_ylabel("Post Share Count")
# ======> There is no good relation that make sense
# "Page popularity/likes" (column 0) vs. "Average of Comments ( Average of CC2, CC3, CC4 )" (column 53)
axes4.plot(X_norm[:,0],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes4.set_xlabel("Page popularity/likes")
axes4.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")
# ======> We can conclude that when post popularity increase, the amounts of the comments increase too.
# "Post Length" (column 35) vs. "Average of Comments ( Average of CC2, CC3, CC4 )" (column 53)
axes5.plot(X_norm[:,35],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes5.set_xlabel("Post Length")
axes5.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")
# ======> We can conclude when posts become very long, average numbers of the comments decrease !!!!
Out[321]:
In [322]:
# Five count plots over the raw data; each add_axes rectangle is
# [left, bottom, width, height] in figure coordinates.
fig = plt.figure()
axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,1.8,0.8])
# Before plotting, collapse the "Post Published Weekday" indicator columns (39..45) into one
# integer code per post (matrixToNumber returns a numpy array)
weekday = matrixToNumber(df.T[39:46].T)
# All vectors below are passed as x=...: newer seaborn releases treat the first positional
# argument of countplot as `data`, while the keyword form works in old and new releases.
# plot "Post published weekday" Countplot
sns.countplot(x=weekday, palette='viridis', ax= axes1)
axes1.set_xlabel("Published Weekday")
# NOTE(review): matrixToNumber gives the first indicator column (39) the code 6 and the last
# one (45) the code 0, and the bars are ordered by code. These labels are therefore only right
# if column 45 is Sunday and column 39 is Saturday -- confirm against the dataset description;
# otherwise the list must be reversed.
axes1.set_xticklabels(["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"])
# plot "H local" Countplot
axes2.set_yscale("log")
sns.countplot(x=df[38].astype('int'), palette= 'viridis', ax= axes2)
axes2.set_xlabel("H Local")
# =====> We can see that majority of posts published at 24 H Local and Wednesday
# Plot "Promotion" Countplot
axes3.set_yscale("log")
sns.countplot(x=df[37], palette= 'viridis', ax= axes3)
axes3.set_xlabel("Promotion")
# =====> We can see that all of the posts promotion status is 0, So This column has no effect on our predictions
# =====> So we can ignore this column
# Plot "Target Variable" Countplot (column 54 of df once the weekday column has been inserted)
sns.countplot(x=df[54].astype('int'), palette= 'viridis', ax= axes4)
axes4.set_yscale("log")
axes4.set_xlabel("Target Varible")
# Plot "Category" Countplot
sns.countplot(x=df[3].astype('int'), palette= 'viridis', ax= axes5)
axes5.set_xlabel("Page Category")
# ======> We can see 9, 18 and 36 categorie's have occupied majority of posts
Out[322]:
In [323]:
# "Post published weekday" (integer code from matrixToNumber) vs. "H local" (column 38)
plt.plot(weekday,df[38], marker='o', markersize=5, lw= 0)
# =====> We can see except Wednesday and Thursday, most of the posts are published in 24 H Local
Out[323]:
In [324]:
fig = plt.figure(figsize=(12,10))
axes1 = fig.add_axes([0, 2, 2, 1])
axes2 = fig.add_axes([0,1,2,1])
# Both bar plots address columns by name, so build one integer-typed copy of df whose column
# labels are strings and share it between the two plots
df_named = df.rename(columns=str).astype('int')
# Plot "Category" vs. "Popularity" barplot (log-scaled y axis)
axes1.set_yscale('log')
sns.barplot(x="3", y="0", data=df_named, palette='viridis', ax=axes1)
axes1.set_xlabel("Page Category")
axes1.set_ylabel("Page Popularity/Likes")
# =====> We can conclude that "47", "61" and "33" has the majority of the popularity
# Plot "Category" vs. "Share amount" barplot
sns.barplot(x="3", y="36", data=df_named, palette='viridis', ax=axes2)
axes2.set_xlabel("Page Category")
axes2.set_ylabel("Page Share Amounts")
# =====> We can conclude that "47", "61" and "33" has the majority of the Share Amounts
Out[324]:
In [325]:
# Using sklearn for splitting data into train_set and test_set
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.cross_validation module was removed in 0.20, so it is only a fallback for old installs
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Using Sklearn linear regression function for finding best thetas ( Linear Regression with multiple Variables )
from sklearn.linear_model import LinearRegression
In [326]:
# Before Fitting model, We should get rid of columns that has no effect on our predictions
# Column 37 is the "Promotion" column, which the count plot showed to be 0 for every post.
# NOTE: not idempotent -- every re-run deletes whatever column is then at index 37.
X_norm = np.delete(X_norm, 37, 1)
In [327]:
# Splitting Normalized dataset into train and test datasets
# X_norm was built from a full copy of X, so its last column is still the target itself.
# It must not be part of the model inputs: otherwise the regression simply reads the answer
# from its own features and every evaluation below looks perfect.
assert np.allclose(X_norm[:, -1], y, equal_nan=True), "expected the target as last column of X_norm"
X_train, X_test, y_train, y_test = train_test_split(X_norm[:, :-1], y, test_size=0.33, random_state=42)
In [328]:
# Initializing Model (this notebook-level `lm` is also the model plotLearningCurves refits)
lm = LinearRegression()
In [329]:
# Fitting the model on the training split
lm.fit(X_train,y_train)
Out[329]:
In [330]:
# Model thetas: one fitted coefficient per input column of X_train
lm.coef_
Out[330]:
In [331]:
# Simplest way for evaluating model ( Mean Squared Error computed by hand )
pred = lm.predict(X_test)
# Average of the squared differences between predictions and true targets
eval_arr = np.mean((pred - y_test) ** 2)
In [332]:
# Plot predictions vs. y_test for better understanding how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])
ax.set_xlabel("Predections")
ax.set_ylabel("Test Target Varible")
ax.plot(pred, y_test,'bo')
# ====> We can conclude that our model works perfect
# NOTE(review): this conclusion only holds if the feature matrix given to train_test_split
# does not contain the target column -- verify before trusting a near-perfect diagonal.
Out[332]:
In [333]:
# Evaluating model with sklearn functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
In [334]:
# Report both error metrics for the predictions on the held-out test split
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Mean Squared Error:',mean_squared_error(y_test, pred))
In [335]:
# (1 - MAE) * 100 is not a percentage: MAE is measured in target units and has no upper
# bound, so the value can be arbitrarily negative. Report the coefficient of determination
# (R^2, at most 1.0) on the held-out split instead.
"R^2 of model on test set: {:.4f}".format(lm.score(X_test, y_test))
Out[335]:
In [336]:
# X_norm carries the target as its last column, so only the feature columns are handed to
# the learning-curve helper; otherwise both curves are computed on inputs that already
# contain the answer.
plotLearningCurves(X_norm[:, :-1], y, 500)
# How to read the curves:
# - a persistent gap between Train Error and Test Error indicates high variance
# - both errors staying high as more data is added indicates high bias
# Re-check the plot against these two criteria after every re-run before concluding that the
# model suffers from neither.
In [ ]: